{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship'\n",
    "        ,'race','sex','capital-gain','capital-loss','hours-per-week','native-country','Salary']\n",
    "adult_data = pd.read_csv('https://raw.githubusercontent.com/zekelabs/data-science-complete-tutorial/master/Data/adult.data.txt', names=cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 32561 entries, 0 to 32560\n",
      "Data columns (total 15 columns):\n",
      "age               32561 non-null int64\n",
      "workclass         32561 non-null object\n",
      "fnlwgt            32561 non-null int64\n",
      "education         32561 non-null object\n",
      "education-num     32561 non-null int64\n",
      "marital-status    32561 non-null object\n",
      "occupation        32561 non-null object\n",
      "relationship      32561 non-null object\n",
      "race              32561 non-null object\n",
      "sex               32561 non-null object\n",
      "capital-gain      32561 non-null int64\n",
      "capital-loss      32561 non-null int64\n",
      "hours-per-week    32561 non-null int64\n",
      "native-country    32561 non-null object\n",
      "Salary            32561 non-null object\n",
      "dtypes: int64(6), object(9)\n",
      "memory usage: 3.7+ MB\n"
     ]
    }
   ],
   "source": [
    "adult_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_cols_df = adult_data.select_dtypes('object')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "for col in cat_cols_df.columns:\n",
    "    adult_data[col] = adult_data[col].str.strip().replace('?',np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 32561 entries, 0 to 32560\n",
      "Data columns (total 15 columns):\n",
      "age               32561 non-null int64\n",
      "workclass         30725 non-null object\n",
      "fnlwgt            32561 non-null int64\n",
      "education         32561 non-null object\n",
      "education-num     32561 non-null int64\n",
      "marital-status    32561 non-null object\n",
      "occupation        30718 non-null object\n",
      "relationship      32561 non-null object\n",
      "race              32561 non-null object\n",
      "sex               32561 non-null object\n",
      "capital-gain      32561 non-null int64\n",
      "capital-loss      32561 non-null int64\n",
      "hours-per-week    32561 non-null int64\n",
      "native-country    31978 non-null object\n",
      "Salary            32561 non-null object\n",
      "dtypes: int64(6), object(9)\n",
      "memory usage: 3.7+ MB\n"
     ]
    }
   ],
   "source": [
    "adult_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>workclass</th>\n",
       "      <th>education</th>\n",
       "      <th>marital-status</th>\n",
       "      <th>occupation</th>\n",
       "      <th>relationship</th>\n",
       "      <th>race</th>\n",
       "      <th>sex</th>\n",
       "      <th>native-country</th>\n",
       "      <th>Salary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>State-gov</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Self-emp-not-inc</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Handlers-cleaners</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Private</td>\n",
       "      <td>11th</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Handlers-cleaners</td>\n",
       "      <td>Husband</td>\n",
       "      <td>Black</td>\n",
       "      <td>Male</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Private</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Wife</td>\n",
       "      <td>Black</td>\n",
       "      <td>Female</td>\n",
       "      <td>Cuba</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          workclass  education      marital-status         occupation  \\\n",
       "0         State-gov  Bachelors       Never-married       Adm-clerical   \n",
       "1  Self-emp-not-inc  Bachelors  Married-civ-spouse    Exec-managerial   \n",
       "2           Private    HS-grad            Divorced  Handlers-cleaners   \n",
       "3           Private       11th  Married-civ-spouse  Handlers-cleaners   \n",
       "4           Private  Bachelors  Married-civ-spouse     Prof-specialty   \n",
       "\n",
       "    relationship   race     sex native-country Salary  \n",
       "0  Not-in-family  White    Male  United-States  <=50K  \n",
       "1        Husband  White    Male  United-States  <=50K  \n",
       "2  Not-in-family  White    Male  United-States  <=50K  \n",
       "3        Husband  Black    Male  United-States  <=50K  \n",
       "4           Wife  Black  Female           Cuba  <=50K  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adult_data.select_dtypes('object').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_cols = list(cat_cols_df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_cols.remove('Salary')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['workclass',\n",
       " 'education',\n",
       " 'marital-status',\n",
       " 'occupation',\n",
       " 'relationship',\n",
       " 'race',\n",
       " 'sex',\n",
       " 'native-country']"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_cols = list(adult_data.select_dtypes('int64').columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['age',\n",
       " 'fnlwgt',\n",
       " 'education-num',\n",
       " 'capital-gain',\n",
       " 'capital-loss',\n",
       " 'hours-per-week']"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler\n",
    "from sklearn.naive_bayes import MultinomialNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_num = Pipeline(steps=[\n",
    "    ('imputer', SimpleImputer(strategy='median')),\n",
    "    ('scaling',MinMaxScaler())\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_cat = Pipeline(steps=[\n",
    "    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n",
    "    ('encoding', OneHotEncoder(handle_unknown='ignore'))\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num', pipeline_num, num_cols),\n",
    "        ('cat', pipeline_cat, cat_cols)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "pipeline = Pipeline(steps=[('preprocessor',preprocessor),\n",
    "                ('classifier', MultinomialNB())])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainX, testX, trainY, testY = train_test_split(adult_data, adult_data.Salary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,\n",
       "         transformer_weights=None,\n",
       "         transformers=[('num', Pipeline(memory=None,\n",
       "     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,\n",
       "       strategy='median', verbo..., 'native-country'])])), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline.fit(trainX,trainY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7994103918437538"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline.score(testX,testY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Salary': {24642: '<=50K'},\n",
       " 'age': {24642: 19},\n",
       " 'capital-gain': {24642: 0},\n",
       " 'capital-loss': {24642: 0},\n",
       " 'education': {24642: 'Some-college'},\n",
       " 'education-num': {24642: 10},\n",
       " 'fnlwgt': {24642: 181572},\n",
       " 'hours-per-week': {24642: 40},\n",
       " 'marital-status': {24642: 'Never-married'},\n",
       " 'native-country': {24642: 'United-States'},\n",
       " 'occupation': {24642: 'Adm-clerical'},\n",
       " 'race': {24642: 'White'},\n",
       " 'relationship': {24642: 'Own-child'},\n",
       " 'sex': {24642: 'Male'},\n",
       " 'workclass': {24642: 'Private'}}"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "testX[:1].to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[9.99012164e-01, 9.87835734e-04]])"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline.predict_proba(testX[:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "{''}"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
